{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Emotion Clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from kneed import KneeLocator\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.metrics import silhouette_score\n",
    "\n",
    "# Location of the source CSV.\n",
    "# NOTE(review): the path is an empty string here; it has to be filled in before the load can succeed.\n",
    "csv_path = ''\n",
    "\n",
    "# low_memory=False parses the file in one pass instead of in chunks.\n",
    "df = pd.read_csv(csv_path, low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns used as clustering features (one intensity-percentage column per emotion).\n",
    "colNames = [\n",
    "    'anger_Intensity_perc',\n",
    "    'fear_Intensity_perc',\n",
    "    'liking_Intensity_perc',\n",
    "    'disgust_Intensity_perc',\n",
    "    'joy_Intensity_perc',\n",
    "    'sadness_Intensity_perc',\n",
    "    'anticipation_Intensity_perc',\n",
    "    'surprise_Intensity_perc',\n",
    "]\n",
    "\n",
    "df1 = df[colNames].astype(pd.Float64Dtype())\n",
    "\n",
    "# Convert explicitly to a plain float64 matrix. Without dtype/na_value, to_numpy() on\n",
    "# nullable Float64 columns can (depending on the pandas version) return an object-dtype\n",
    "# array holding pd.NA, which scikit-learn cannot consume reliably.\n",
    "emo_arr = df1.to_numpy(dtype='float64', na_value=float('nan'))\n",
    "\n",
    "# KMeans rejects NaN input, so fail here with a message that points at the data\n",
    "# instead of failing later inside the clustering cells.\n",
    "n_missing = int(pd.isna(emo_arr).any(axis=1).sum())\n",
    "if n_missing:\n",
    "    raise ValueError(f'{n_missing} rows have missing emotion scores; KMeans cannot cluster NaN values')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Settings shared by the model-selection sweeps (elbow and silhouette).\n",
    "# random_state is pinned to the same seed as the final model so the curves\n",
    "# and the detected elbow are reproducible after a kernel restart.\n",
    "kmeans_kwargs = {\n",
    "    \"init\": \"random\",\n",
    "    \"n_init\": 10,\n",
    "    \"max_iter\": 300,\n",
    "    \"random_state\": 45,\n",
    "}\n",
    "\n",
    "# SSE (KMeans inertia) for each candidate number of clusters, k = 1..10\n",
    "k_values = range(1, 11)\n",
    "sse = []\n",
    "for k in k_values:\n",
    "    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(emo_arr)\n",
    "    sse.append(kmeans.inertia_)\n",
    "\n",
    "plt.style.use(\"fivethirtyeight\")\n",
    "plt.plot(k_values, sse)\n",
    "plt.xticks(k_values)\n",
    "plt.xlabel(\"Number of Clusters\")\n",
    "plt.ylabel(\"SSE\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find the elbow of the SSE curve, treated as convex and decreasing over k = 1..10.\n",
    "kl = KneeLocator(\n",
    "    x=range(1, 11),\n",
    "    y=sse,\n",
    "    curve=\"convex\",\n",
    "    direction=\"decreasing\",\n",
    ")\n",
    "\n",
    "# Show the number of clusters at the detected elbow.\n",
    "kl.elbow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Silhouette sweep. The sweep starts at two clusters, so k runs over 2..10.\n",
    "cluster_counts = range(2, 11)\n",
    "\n",
    "# One silhouette coefficient per candidate k, in the same order as cluster_counts\n",
    "silhouette_coefficients = []\n",
    "for k in cluster_counts:\n",
    "    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(emo_arr)\n",
    "    silhouette_coefficients.append(silhouette_score(emo_arr, kmeans.labels_))\n",
    "\n",
    "plt.style.use(\"fivethirtyeight\")\n",
    "plt.plot(cluster_counts, silhouette_coefficients)\n",
    "plt.xticks(cluster_counts)\n",
    "plt.xlabel(\"Number of Clusters\")\n",
    "plt.ylabel(\"Silhouette Coefficient\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the final 4-cluster model and attach to every row of df its cluster label\n",
    "# ('clusterLab_Intensity') and its distance to the nearest centroid\n",
    "# ('distance_centroid_Intensity').\n",
    "\n",
    "kmeans = KMeans(\n",
    "    init=\"random\",\n",
    "    n_clusters=4,\n",
    "    n_init=10,\n",
    "    max_iter=300,\n",
    "    random_state=45,\n",
    ")\n",
    "kmeans.fit(emo_arr)\n",
    "\n",
    "# The lowest SSE value\n",
    "print(kmeans.inertia_)\n",
    "\n",
    "# Final locations of the centroids\n",
    "print(kmeans.cluster_centers_)\n",
    "\n",
    "# The number of iterations required to converge\n",
    "print(kmeans.n_iter_)\n",
    "\n",
    "print(kmeans.labels_[:15])\n",
    "\n",
    "# transform() returns each row's distance to every centroid; the row-wise\n",
    "# minimum is the distance to the nearest one.\n",
    "distances = kmeans.transform(emo_arr)\n",
    "min_distances = distances.min(axis=1)\n",
    "\n",
    "# Attach the results by position instead of merging on the index: emo_arr was\n",
    "# built from df row by row, so positional assignment stays aligned even when\n",
    "# df does not carry a default 0..n-1 index, and a length mismatch raises\n",
    "# instead of silently dropping rows.\n",
    "df1 = df.assign(\n",
    "    clusterLab_Intensity=kmeans.labels_,\n",
    "    distance_centroid_Intensity=min_distances,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Recorded K-means cluster centroids. Each row gives the cluster label, the centroid value for each of the eight emotion columns, and the dominant emotions read off that centroid.\n",
    "\n",
    "```\n",
    "     anger      fear       liking     disgust    joy        sadness    anticip    surprise\n",
    " 0: [0.00793729 0.01711579 0.43089512 0.03973319 0.40138208 0.0941459  0.08545268 0.00879063]  liking-joy\n",
    " 1: [0.03578495 0.05541088 0.12408174 0.38524972 0.06678249 0.32273822 0.05318213 0.009952  ]  disgust-sadness-liking\n",
    " 2: [0.00671334 0.01045805 0.21578515 0.07292762 0.06051232 0.632377   0.31008125 0.00122653]  sadness-anticip-liking\n",
    "```\n",
    "\n",
    "Count Percentage:\n",
    "\n",
    "```\n",
    "      anger      fear       liking     disgust    joy        sadness    anticip    surprise\n",
    " 1: [0.00694415 0.01822783 0.54055723 0.03951037 0.20931848 0.18038735  0.15283691 0.00505459]  liking>>joy>sadness>anticip\n",
    " 2: [0.00970358 0.01455514 0.28868302 0.05355924 0.56833728 0.05498773  0.03940203 0.01017401]  joy>>liking\n",
    " 3: [0.00721937 0.01139597 0.1850879  0.07867888 0.05602888 0.65930071  0.31583616 0.00228827]  sadness>>anticip>>liking\n",
    " 0: [0.0371478  0.0535413  0.11107553 0.40315477 0.06536692 0.31955346  0.04968316 0.01016023]  disgust>sadness>>liking\n",
    "```\n",
    "\n",
    "Intensity Percentage:\n",
    "\n",
    "```\n",
    "      anger      fear       liking     disgust    joy        sadness    anticip    surprise\n",
    " 0: [0.04721393 0.05462736 0.10020475 0.44751062 0.06338764 0.27853998  0.03461098 0.00851572]  disgust>>sadness>>liking\n",
    " 1: [0.00891785 0.01231273 0.27892975 0.04435246 0.5837455  0.06280159  0.04357636 0.00894011]  joy>>liking\n",
    " 2: [0.00555592 0.01498476 0.57175782 0.0453448  0.1859476  0.17190283  0.14719702 0.00450627]  liking>>joy>sadness>anticip\n",
    " 3: [0.0066258  0.01393328 0.19067233 0.08881245 0.05532933 0.64271089  0.25915811 0.00191593]  sadness>>anticip>liking\n",
    "```\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Readable names for the four numeric cluster ids, matching the\n",
    "# 'Intensity Percentage' centroid table.\n",
    "mapping = {\n",
    "    0: 'disgust>>sadness>>liking',\n",
    "    1: 'joy>>liking',\n",
    "    2: 'liking>>joy>sadness>anticip',\n",
    "    3: 'sadness>>anticip>liking',\n",
    "}\n",
    "\n",
    "# Swap the numeric ids in the label column for their names.\n",
    "df1['clusterLab_Intensity'] = df1['clusterLab_Intensity'].replace(mapping)\n",
    "\n",
    "# Write the labelled dataset without the index column.\n",
    "output_csv = 'Final Datasets/df7_3EmotionsCH_split_synonym_expanded_emo scores_narrow.csv'\n",
    "df1.to_csv(output_csv, index=False)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
